we investigate the effect of diet composition on different measures of obesity.
Obesity can be measured in multiple ways
But each of these may categorise obesity differently.
In terms of diet composition, we look at
we then want to see what is the effect of each food type on a measurement for obesity (whether it is significant or not), and compare how this varies across each obesity measurement.
e.g. Maybe meat servings is a significant contributor to BMI but not to waist circumference.
Then, we can compare to see the effect of each food type across different obesity measures. Examples
e.g. Does fruit servings have a bigger effect on BMI, waist to height ratio, or waist circumference which of the food types has the biggest contribution to BMI reduction/gain.
We can do this by calculating and comparing the standardized betas.
(we can adjust for obvious contributors to each measurement such as age and sex).
# selecting macros
foods = tech_nutr %>% dplyr::select(ABSPID,
VEGLEG1N,
VEGLEG2N,
FRUIT1N,
FRUIT2N,
DAIRY1N,
DAIRY2N,
MEAT1N,
MEAT2N,
GRAINS1N,
GRAINS2N)
# gettin average of macros
avg_veges <- rowMeans(foods[ , c(2,3)], na.rm=TRUE)
avg_fruit <- rowMeans(foods[ , c(4,5)], na.rm=TRUE)
avg_dairy <- rowMeans(foods[ , c(6,7)], na.rm=TRUE)
avg_meat <- rowMeans(foods[ , c(8,9)], na.rm=TRUE)
avg_grains <- rowMeans(foods[ , c(10,11)], na.rm=TRUE)
dat <- cbind(tech_nutr$ABSPID,
avg_veges,
avg_fruit,
avg_dairy,
avg_meat,
avg_grains)
dat <- as_tibble(dat)
colnames(dat)[1] <- "ABSPID"
dat$avg_veges <- as.numeric(dat$avg_veges)
dat$avg_fruit <- as.numeric(dat$avg_fruit)
dat$avg_dairy <- as.numeric(dat$avg_dairy)
dat$avg_meat <- as.numeric(dat$avg_meat)
dat$avg_grains <- as.numeric(dat$avg_grains)
tech_biom1 = tech_biom %>% dplyr::select(c(1:53))
final = merge(dat, tech_biom1, by = "ABSPID")
final = final %>% dplyr::select(avg_veges,
avg_fruit,
avg_dairy,
avg_meat,
avg_grains,
BMISC,
AGEC,
SEX,
PHDKGWBC,
PHDCMHBC,
PHDCMWBC,
SF2SA1QN)
final$w2hratio = (final$PHDCMWBC)/(final$PHDCMHBC)
# to make results more robust, focus on adults only
final <- final %>% filter(AGEC > 17)
final = final %>% na.omit()
numeric_hist <- function(data, x) {
ggplot(data, aes_string(x = `x`)) +
geom_histogram()
}
numeric_hist(final, x = "BMISC")
numeric_hist(final, x = "avg_dairy")
numeric_hist(final, x = "avg_fruit")
numeric_hist(final, x = "avg_meat")
numeric_hist(final, x = "avg_grains")
numeric_hist(final, x = "avg_veges")
numeric_hist(final, x = "AGEC")
bmi_full <- lm(BMISC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = final)
bmi_null <- lm(BMISC ~ 1, dat = final)
w2hratio_full <- lm(w2hratio ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = final)
w2hratio_null <- lm(w2hratio ~ 1, dat = final)
waist_full <- lm(PHDCMWBC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = final)
waist_null <- lm(PHDCMWBC ~ 1, dat = final)
NOTE: robust regression was attempted and gave similar results to OLS, so just using OLS instead.
tab_model(bmi_full, waist_full, w2hratio_full, show.ci = F, show.se = T,
dv.labels = c("BMI", "Waist circumf.", "waist to height ratio"))
| BMI | Waist circumf. | waist to height ratio | |||||||
|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | std. Error | p | Estimates | std. Error | p | Estimates | std. Error | p |
| (Intercept) | 25.98 | 0.24 | <0.001 | 89.07 | 0.59 | <0.001 | 0.49 | 0.00 | <0.001 |
| avg veges | -0.12 | 0.03 | <0.001 | -0.29 | 0.08 | <0.001 | -0.00 | 0.00 | <0.001 |
| avg fruit | -0.20 | 0.05 | <0.001 | -0.57 | 0.11 | <0.001 | -0.00 | 0.00 | <0.001 |
| avg dairy | 0.10 | 0.07 | 0.141 | 0.18 | 0.16 | 0.255 | -0.00 | 0.00 | 0.275 |
| avg meat | 0.22 | 0.05 | <0.001 | 0.37 | 0.12 | 0.002 | 0.00 | 0.00 | 0.007 |
| avg grains | -0.17 | 0.03 | <0.001 | -0.38 | 0.07 | <0.001 | -0.00 | 0.00 | <0.001 |
| AGEC | 0.05 | 0.00 | <0.001 | 0.23 | 0.01 | <0.001 | 0.00 | 0.00 | <0.001 |
| SEX [2] | -0.66 | 0.13 | <0.001 | -10.08 | 0.31 | <0.001 | -0.02 | 0.00 | <0.001 |
| Observations | 7811 | 7811 | 7811 | ||||||
| R2 / R2 adjusted | 0.041 / 0.040 | 0.190 / 0.190 | 0.155 / 0.154 | ||||||
dairy is insignificant for all
Model selection using backward and forward selection
n = 9979
# bmi_AIC is just full model
# bmi_AIC <- step(bmi_full, direction = "backward", trace = F)
bmi_BIC <- step(bmi_full, direction = "backward", trace = F, k = log(n))
# fwd AIC model same as back
# step(bmi_null, scope = list(lower = bmi_null, upper = bmi_full), direction = "forward", trace = F)
waist_AIC <- step(waist_full, direction = "backward", trace = F)
# waist_BIC same as AIC
# waist_BIC <- step(waist_full, direction = "backward", trace = F, k = log(n))
# backward selection same as fwd
# step(waist_null, scope = list(lower = waist_null, upper = waist_full), direction = "forward", trace = F)
w2hratio_AIC <- step(w2hratio_full, direction = "backward", trace = F)
w2hratio_BIC <- step(w2hratio_full, direction = "backward", trace = F, k = log(n))
# backward selection same as fwd
# step(w2hratio_null, scope = list(lower = w2hratio_null, upper = waist_full), direction = "forward", trace = F)
params = trainControl(method = "cv", number = 10, verboseIter = FALSE)
set.seed(2021)
cv_objects = list(
bmi_full = train(BMISC ~ avg_veges + avg_fruit + avg_dairy + avg_meat +
avg_grains + AGEC + SEX,
method = "lm",
data = final,
trControl = params),
bmi_BIC = train(BMISC ~ avg_veges + avg_fruit + avg_meat + avg_grains +
AGEC + SEX,
method = "lm",
data = final,
trControl = params),
waist_full = train(PHDCMWBC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX,
method = "lm",
data = final,
trControl = params),
waist_AIC = train(PHDCMWBC ~ avg_veges + avg_fruit + avg_meat + avg_grains + AGEC + SEX,
method = "lm",
data = final,
trControl = params),
w2hratio_full = train(w2hratio ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX,
method = "lm",
data = final,
trControl = params),
w2hratio_AIC = train(w2hratio ~ avg_veges + avg_fruit + avg_meat + avg_grains + AGEC + SEX,
method = "lm",
data = final,
trControl = params),
w2hratio_BIC = train(w2hratio ~ avg_veges + avg_fruit + avg_grains +
AGEC + SEX,
method = "lm",
data = final,
trControl = params)
)
cv_results_bmi = resamples(cv_objects[1:2], metric = "Rsquared")
ggplot(cv_results_bmi) +
theme_bw() +
labs(x = "Models", y = "Mean Absolute Error", title = "10-Fold CV Performance")
cv_results_waist = resamples(cv_objects[3:4], metric = "Rsquared")
ggplot(cv_results_waist) +
theme_bw() +
labs(x = "Models", y = "Mean Absolute Error", title = "10-Fold CV Performance")
cv_results_w2hratio = resamples(cv_objects[5:7], metric = "Rsquared")
ggplot(cv_results_w2hratio) +
theme_bw() +
labs(x = "Models", y = "Mean Absolute Error", title = "10-Fold CV Performance")
bmi_full <- lm(BMISC ~ avg_veges + avg_fruit + avg_dairy + avg_meat +
avg_grains + AGEC + SEX,
method = "lm",
data = final,
trControl = params)
summary(bmi_full)
##
## Call:
## lm(formula = BMISC ~ avg_veges + avg_fruit + avg_dairy + avg_meat +
## avg_grains + AGEC + SEX, data = final, method = "lm", trControl = params)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.835 -3.793 -0.854 2.789 35.088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.975387 0.243339 106.746 < 2e-16 ***
## avg_veges -0.124598 0.032734 -3.806 0.000142 ***
## avg_fruit -0.198827 0.046527 -4.273 1.95e-05 ***
## avg_dairy 0.097887 0.066491 1.472 0.141009
## avg_meat 0.216046 0.049438 4.370 1.26e-05 ***
## avg_grains -0.174983 0.028362 -6.170 7.19e-10 ***
## AGEC 0.053893 0.003592 15.005 < 2e-16 ***
## SEX2 -0.663969 0.127662 -5.201 2.03e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.445 on 7803 degrees of freedom
## Multiple R-squared: 0.04127, Adjusted R-squared: 0.04041
## F-statistic: 47.99 on 7 and 7803 DF, p-value: < 2.2e-16
model.diag.metrics <- augment(bmi_full)
head(model.diag.metrics)
## # A tibble: 6 × 15
## .rownames BMISC avg_veges avg_fruit avg_dairy avg_meat avg_grains AGEC SEX
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <fct>
## 1 1 20.9 3.62 0.412 2.51 4.96 6.11 46 2
## 2 2 29.8 2.95 2.03 1.44 2.19 3.45 77 1
## 3 3 24.4 9.08 1.34 1.87 3.04 2.45 55 2
## 4 4 21.4 5.87 5.06 3.64 2.79 2.35 59 2
## 5 6 25.1 4.54 0.154 1.59 1.56 1.67 35 2
## 6 8 30.7 0.706 0 1.77 1.59 3.56 35 1
## # … with 6 more variables: .fitted <dbl>, .resid <dbl>, .hat <dbl>,
## # .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>
plot1 <- ggplot(model.diag.metrics, aes(avg_meat, BMISC)) +
geom_point() +
stat_smooth(method = lm, se = FALSE) +
geom_segment(aes(xend = avg_meat, yend = .fitted), color = "red", size = 0.3)
plot2 <- ggplot(model.diag.metrics, aes(avg_dairy, BMISC)) +
geom_point() +
stat_smooth(method = lm, se = FALSE) +
geom_segment(aes(xend = avg_dairy, yend = .fitted), color = "red", size = 0.3)
plot3 <- ggplot(model.diag.metrics, aes(avg_grains, BMISC)) +
geom_point() +
stat_smooth(method = lm, se = FALSE) +
geom_segment(aes(xend = avg_grains, yend = .fitted), color = "red", size = 0.3)
plot4 <- ggplot(model.diag.metrics, aes(avg_veges, BMISC)) +
geom_point() +
stat_smooth(method = lm, se = FALSE) +
geom_segment(aes(xend = avg_veges, yend = .fitted), color = "red", size = 0.3)
grid.arrange(plot1, plot2, plot3, plot4, ncol=2)
The residuals mostly seem to follow the same pattern across features
par(mfrow=c(2,2))
plot(bmi_full)
Residuals vs. fitted is relatively linear, indicating a linear relationship. Normal Q-Q plot, standardised residuals follows closely the normal line, indicating normality of residuals. Scale-location has no obvious non-linear pattern, indicating homoscedasticity. In the residuals vs. leverage graph a number of points are outside of Cook’s distance, indicating they are outliers. Some outlier effects may exist in the regression models specified above.
# creating scaled df
sex <- final$SEX
socioeconomic <- final$SF2SA1QN
scaled_df <- final %>%
dplyr::select(-c(SEX, SF2SA1QN)) %>%
scale() %>% as.data.frame()
scaled_df$SEX <- sex
scaled_df$SF2SA1QN <- socioeconomic
# lm scaled to get standardized betas to compare
bmi_scaled <- lm(BMISC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = scaled_df)
w2h_scaled <- lm(w2hratio ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = scaled_df)
waist_scaled <- lm(PHDCMWBC ~ avg_veges + avg_fruit + avg_dairy + avg_meat + avg_grains + AGEC + SEX, dat = scaled_df)
tab_model(bmi_scaled, w2h_scaled, waist_scaled)
| BMISC | w 2 hratio | PHDCMWBC | |||||||
|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
| (Intercept) | 0.06 | 0.03 – 0.09 | <0.001 | 0.09 | 0.06 – 0.12 | <0.001 | 0.36 | 0.33 – 0.39 | <0.001 |
| avg veges | -0.05 | -0.07 – -0.02 | <0.001 | -0.05 | -0.07 – -0.03 | <0.001 | -0.04 | -0.06 – -0.02 | <0.001 |
| avg fruit | -0.05 | -0.07 – -0.03 | <0.001 | -0.07 | -0.09 – -0.05 | <0.001 | -0.05 | -0.07 – -0.03 | <0.001 |
| avg dairy | 0.02 | -0.01 – 0.04 | 0.141 | -0.01 | -0.03 – 0.01 | 0.275 | 0.01 | -0.01 – 0.03 | 0.255 |
| avg meat | 0.05 | 0.03 – 0.08 | <0.001 | 0.03 | 0.01 – 0.05 | 0.007 | 0.03 | 0.01 – 0.06 | 0.002 |
| avg grains | -0.08 | -0.10 – -0.05 | <0.001 | -0.05 | -0.08 – -0.03 | <0.001 | -0.06 | -0.09 – -0.04 | <0.001 |
| AGEC | 0.17 | 0.15 – 0.19 | <0.001 | 0.38 | 0.36 – 0.40 | <0.001 | 0.28 | 0.26 – 0.30 | <0.001 |
| SEX [2] | -0.12 | -0.16 – -0.07 | <0.001 | -0.18 | -0.22 – -0.14 | <0.001 | -0.69 | -0.73 – -0.64 | <0.001 |
| Observations | 7811 | 7811 | 7811 | ||||||
| R2 / R2 adjusted | 0.041 / 0.040 | 0.155 / 0.154 | 0.190 / 0.190 | ||||||
meat has the biggest effect on BMI overall effect and associated with increased BMI. fruit and veges associated with reduced waist circumference and waist to height ratio.
food_groups <- final %>% dplyr::select(avg_dairy,
avg_fruit,
avg_grains,
avg_meat,
avg_veges)
numerics <- final %>% dplyr::select(-SEX,
-SF2SA1QN)
# principal component analysis
res.pca <- prcomp(
food_groups,
center = T,
scale = T
)
# extracting pc1 and pc2
pcs = res.pca$x[, c(1,2)] %>% as.data.frame()
## keeping original data
pcs1 <- pcs
## Creating k-means clustering model
fit_cluster_kmeans_pca <- kmeans(scale(pcs), 5)
# Assigning the result to the data used to create the tsne
pcs1$cl_kmeans <- factor(fit_cluster_kmeans_pca$cluster)
# Creating hierarchical cluster model
fit_cluster_hierarchical_pca <- hclust(dist(scale(pcs)), method = "ward.D2")
# Assigning the result to the data used
pcs1$cl_hierarchical <- factor(cutree(fit_cluster_hierarchical_pca, k=5))
# obesity measured via BMI >= 30
pcs1$BMISC <- final$BMISC
pcs1$obese_BMI <- pcs1$BMISC >= 30
# obesity measured via waist circumference
final$obese_waist_circum = NA
for (i in 1:nrow(final)) {
if (final$SEX[i] == "2") {
if (final$PHDCMWBC[i] > 88) {
final$obese_waist_circum[i] = 1
} else {
final$obese_waist_circum[i] = 0
}
} else if (final$SEX[i] == "1") {
if (final$PHDCMWBC[i] > 102) {
final$obese_waist_circum[i] = 1
} else {
final$obese_waist_circum[i] = 0
}
}
}
pcs1$obese_waist_circum <- final$obese_waist_circum
# obesity measured using w2hratio
final$obese_w2hratio = NA
for (i in 1:nrow(final)) {
if (final$SEX[i] == "2") {
if (final$w2hratio[i] >= 0.58) {
final$obese_w2hratio[i] = 1
} else {
final$obese_w2hratio[i] = 0
}
} else if (final$SEX[i] == "1") {
if (final$w2hratio[i] >= 0.63) {
final$obese_w2hratio[i] = 1
} else {
final$obese_w2hratio[i] = 0
}
}
}
pcs1$obese_w2hratio <- final$obese_w2hratio
data <- scale(pcs)
k.max <- 15
wss <- sapply(1:k.max,
function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares", main="k-means elbow")
#### hierarchical
#get dendrogram
plot(fit_cluster_hierarchical_pca, hang = -1)
pcs1 %>%
ggplot(aes(x = PC1,
y = PC2))+
geom_point(aes(color=obese_BMI),
alpha = 0.5)+
scale_color_manual(values = c("#964B00", "#8F00FF", "#0000FF", "#00FF00",
"#FFA500", "#808080", "#FF0000")) +
geom_mark_ellipse(aes(color = cl_hierarchical,
label=cl_hierarchical),
expand = unit(0.5,"mm"),
label.buffer = unit(-5, 'mm')) +
theme(legend.position = "none") + ggtitle("Hierarchical, BMI")
pcs1 %>%
ggplot(aes(x = PC1,
y = PC2))+
geom_point(aes(color=factor(as.character(obese_waist_circum))),
alpha = 0.5)+
scale_color_manual(values = c("#964B00", "#8F00FF", "#0000FF", "#00FF00",
"#FFA500", "#808080", "#FF0000")) +
geom_mark_ellipse(aes(color = cl_hierarchical,
label=cl_hierarchical),
expand = unit(0.5,"mm"),
label.buffer = unit(-5, 'mm')) +
theme(legend.position = "none") + ggtitle("Hierarchical, waist circumference")
pcs1 %>%
ggplot(aes(x = PC1,
y = PC2))+
geom_point(aes(color=obese_w2hratio %>% as.character() %>% as.factor()),
alpha = 0.5)+
scale_color_manual(values = c("#964B00", "#8F00FF", "#0000FF", "#00FF00",
"#FFA500", "#808080", "#FF0000")) +
geom_mark_ellipse(aes(color = cl_hierarchical,
label=cl_hierarchical),
expand = unit(0.5,"mm"),
label.buffer = unit(-5, 'mm')) +
theme(legend.position = "none") + ggtitle("Hierarchical, waist to height ratio")
pcs1 %>% group_by(cl_hierarchical) %>%
summarise(prop_obese_BMI = sum(as.numeric(obese_BMI))/n())
## # A tibble: 5 × 2
## cl_hierarchical prop_obese_BMI
## <fct> <dbl>
## 1 1 0.279
## 2 2 0.256
## 3 3 0.208
## 4 4 0.301
## 5 5 0.249
pcs1 %>% group_by(cl_hierarchical) %>%
summarise(prop_obese_waist_circum = sum(as.numeric(obese_waist_circum))/n())
## # A tibble: 5 × 2
## cl_hierarchical prop_obese_waist_circum
## <fct> <dbl>
## 1 1 0.405
## 2 2 0.389
## 3 3 0.321
## 4 4 0.447
## 5 5 0.343
pcs1 %>% group_by(cl_hierarchical) %>%
summarise(prop_obese_w2hratio = sum(as.numeric(obese_w2hratio))/n())
## # A tibble: 5 × 2
## cl_hierarchical prop_obese_w2hratio
## <fct> <dbl>
## 1 1 0.244
## 2 2 0.257
## 3 3 0.170
## 4 4 0.301
## 5 5 0.207
scaled_numerics <- scale(numerics)
scaled_numerics <- as.data.frame(scaled_numerics)
scaled_numerics$cl_hierarchical <- pcs1$cl_hierarchical
scaled_numerics %>% group_by(cl_hierarchical) %>%
summarise(avg_veges = mean(avg_veges) %>% round(2),
avg_dairy = mean(avg_dairy) %>% round(2),
avg_fruit = mean(avg_fruit) %>% round(2),
avg_meat = mean(avg_meat) %>% round(2),
avg_grains = mean(avg_grains) %>% round(2),
avg_age = mean(AGEC) %>% round(2),
avg_bmi = mean(BMISC) %>% round(2),
avg_waist = mean(PHDCMWBC) %>% round(2),
avg_w2hratio = mean(w2hratio) %>% round(2))
## # A tibble: 5 × 10
## cl_hierarchical avg_veges avg_dairy avg_fruit avg_meat avg_grains avg_age
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 0.94 -0.3 0.15 1.02 0.06 0.08
## 2 2 -0.27 0.55 0.05 -0.33 0.26 -0.01
## 3 3 0.55 0.88 0.93 0.48 1.1 -0.02
## 4 4 -0.56 -0.55 -0.42 -0.55 -0.57 -0.02
## 5 5 -0.11 2.55 0.69 -0.22 1.36 -0.21
## # … with 3 more variables: avg_bmi <dbl>, avg_waist <dbl>, avg_w2hratio <dbl>
Table above shows how many standard deviations away from 0 each cluster is in each variable.
fit_cluster_kmeans_pca$centers
## PC1 PC2
## 1 0.1426913 -0.7748435
## 2 -1.6374273 1.6626483
## 3 0.9929414 0.1505057
## 4 -0.2778727 0.6534491
## 5 -1.3425852 -1.3430297
fit_cluster_kmeans_pca$withinss
## [1] 743.3057 852.6626 814.0225 1026.2231 1194.1863
We can see that the clusters have reasonably similar variance
pcs1 %>%
ggplot(aes(x = PC1,
y = PC2))+
geom_point(aes(color=obese_BMI),
alpha = 0.5)+
scale_color_manual(values = c("#964B00", "#8F00FF", "#0000FF", "#00FF00",
"#FFA500", "#808080", "#FF0000")) +
geom_mark_ellipse(aes(color = cl_kmeans,
label=cl_kmeans),
expand = unit(0.5,"mm"),
label.buffer = unit(-5, 'mm')) +
theme(legend.position = "none") + ggtitle("K-means, BMI")
pcs1 %>%
ggplot(aes(x = PC1,
y = PC2))+
geom_point(aes(color=factor(as.character(obese_waist_circum))),
alpha = 0.5)+
scale_color_manual(values = c("#964B00", "#8F00FF", "#0000FF", "#00FF00",
"#FFA500", "#808080", "#FF0000")) +
geom_mark_ellipse(aes(color = cl_kmeans,
label=cl_kmeans),
expand = unit(0.5,"mm"),
label.buffer = unit(-5, 'mm')) +
theme(legend.position = "none") + ggtitle("K-means, waist circumference")
pcs1 %>%
ggplot(aes(x = PC1,
y = PC2))+
geom_point(aes(color=obese_w2hratio %>% as.character() %>% as.factor()),
alpha = 0.5)+
scale_color_manual(values = c("#964B00", "#8F00FF", "#0000FF", "#00FF00",
"#FFA500", "#808080", "#FF0000")) +
geom_mark_ellipse(aes(color = cl_kmeans,
label=cl_kmeans),
expand = unit(0.5,"mm"),
label.buffer = unit(-5, 'mm')) +
theme(legend.position = "none") + ggtitle("K-means, waist to height ratio")
pcs1 %>% group_by(cl_kmeans) %>%
summarise(prop_obese_BMI = sum(as.numeric(obese_BMI))/n())
## # A tibble: 5 × 2
## cl_kmeans prop_obese_BMI
## <fct> <dbl>
## 1 1 0.272
## 2 2 0.260
## 3 3 0.297
## 4 4 0.273
## 5 5 0.231
pcs1 %>% group_by(cl_kmeans) %>%
summarise(prop_obese_waist_circum = sum(as.numeric(obese_waist_circum))/n())
## # A tibble: 5 × 2
## cl_kmeans prop_obese_waist_circum
## <fct> <dbl>
## 1 1 0.414
## 2 2 0.373
## 3 3 0.446
## 4 4 0.404
## 5 5 0.322
pcs1 %>% group_by(cl_kmeans) %>%
summarise(prop_obese_w2hratio = sum(as.numeric(obese_w2hratio))/n())
## # A tibble: 5 × 2
## cl_kmeans prop_obese_w2hratio
## <fct> <dbl>
## 1 1 0.279
## 2 2 0.210
## 3 3 0.298
## 4 4 0.249
## 5 5 0.179
scaled_numerics <- scale(numerics)
scaled_numerics <- as.data.frame(scaled_numerics)
scaled_numerics$cl_kmeans <- pcs1$cl_kmeans
scaled_numerics %>% group_by(cl_kmeans) %>%
summarise(avg_veges = mean(avg_veges) %>% round(2),
avg_dairy = mean(avg_dairy) %>% round(2),
avg_fruit = mean(avg_fruit) %>% round(2),
avg_meat = mean(avg_meat) %>% round(2),
avg_grains = mean(avg_grains) %>% round(2),
avg_age = mean(AGEC) %>% round(2),
avg_bmi = mean(BMISC) %>% round(2),
avg_waist = mean(PHDCMWBC) %>% round(2),
avg_w2hratio = mean(w2hratio) %>% round(2))
## # A tibble: 5 × 10
## cl_kmeans avg_veges avg_dairy avg_fruit avg_meat avg_grains avg_age avg_bmi
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 -0.42 0.46 -0.04 -0.48 0.14 -0.01 0.01
## 2 2 1.93 -0.08 0.52 1.89 0.54 0.02 -0.02
## 3 3 -0.6 -0.68 -0.5 -0.59 -0.69 -0.02 0.04
## 4 4 0.46 -0.27 0.07 0.53 -0.01 0.08 0.01
## 5 5 0.28 1.6 0.94 0.19 1.28 -0.11 -0.12
## # … with 2 more variables: avg_waist <dbl>, avg_w2hratio <dbl>